
import logging
from init import PATHS
LOGGER = logging.getLogger(__name__)
import os
import numpy as np
import pandas as pd


"""
objective: construct dataset family-level backward citations
classify those citations by the tech and type of firms
start with backward citations of the bat families of OEMs
1) get docdb ids of bat families of OEMs: C_PatentVariables/Families_of_OEMs_cpc_ipc.csv
2) get all the docdb ids cited (backward docdb ids) and date of citation
3) get the tech types of those cited families (using /C_PatentVariables/Families_of_ipc_cpc_transpo.csv)
4) add list of NAICS codes associated with the firms that filed the cited families
get the bvdid of the firms associated with the families
then get the naics codes of those firms from Orbis
5) add the NACE code that PATSTAT assigned to the cited family (dummy if motor sector)
6) add dummy if the cited family is from self, another OEM (including subsidiaries)
"""


def main():
    LOGGER.info('********************* i_classifying_forward_and_backward_citations.py ************************')
    for type_citation in ['forward', 'backward']:
        LOGGER.info(f'********************* {type_citation} ************************')
        main_type_citation(type_citation)
        output_timeseries_for_type_citation(type_citation, subset='')
    output_timeseries_for_type_citation(type_citation, subset='OEMs')



def main_type_citation(type_citation):
    BatFC_Fam = get_BatFCfamilies()
    BatFC_Fam = get_forward_or_backward_citations(BatFC_Fam, type_citation)
    BatFC_Fam = add_techtype_ofthe_forward_or_backward_citations(BatFC_Fam, type_citation)
    BatFC_Fam = add_bvdids_and_NAICS_of_firms_citing_or_cited(BatFC_Fam, type_citation)
    BatFC_Fam = add_dummies(BatFC_Fam, type_citation)
    BatFC_Fam.to_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/Citations_{type_citation}_of_AllBatFC_Families.csv', index=False)
    LOGGER.info(f'Citations_{type_citation}_of_AllBatFC_Families.csv    SAVED')


def get_BatFCfamilies():
    data = pd.read_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/Families_of_ipc_cpc_transpo_cpc_ipc.csv')
    mask = data['Sub-sector'].str.contains('batteries') | data['Sub-sector'].str.contains('fuel cells')
    BatFC_Fam = data[mask & data['Sub-sector'].notnull()]
    BatFC_Fam = BatFC_Fam[['docdb_family_id', 'Sub-sector']].drop_duplicates()
    data = pd.read_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/Families_of_ipc_cpc_transpo.csv', usecols=['docdb_family_id', 'earliest_filing_year']).drop_duplicates()
    BatFC_Fam = BatFC_Fam.merge(data, on='docdb_family_id', how='left')
    return BatFC_Fam


def set_stubs(type_citation):
    init_stub, final_stub = '', ''
    if type_citation == 'forward':
        init_stub, final_stub = 'cited', 'citing'
    elif type_citation == 'backward':
        init_stub, final_stub = 'citing', 'cited'
    return init_stub, final_stub


def get_forward_or_backward_citations(BatFC_Fam, type_citation):
    init_stub, final_stub = set_stubs(type_citation)
    BatFC_Fam = BatFC_Fam.rename(columns={'docdb_family_id': f'{init_stub}_docdb_family_id', 'Sub-sector': f'Sub-sector_{init_stub}', 'earliest_filing_year': f'earliest_filing_year_{init_stub}'})
    listfiles = [k for k in os.listdir(PATHS.PATSTAT / 'PATSTATGlobal2022_derivatives/citations') if 'docdbid_citations_date' in k]
    dfcitations = []
    for file in listfiles:
        for chunk in pd.read_csv(PATHS.PATSTAT / 'PATSTATGlobal2022_derivatives/citations' / file, chunksize=1000000):
            dfcitations.append(chunk[chunk[f'{init_stub}_docdb_family_id'].isin(BatFC_Fam[f'{init_stub}_docdb_family_id'].tolist())])
    dfcitations = pd.concat(dfcitations)
    dfcitations = dfcitations[[f'{init_stub}_docdb_family_id', f'{init_stub}_appln_filing_date', f'{final_stub}_docdb_family_id', f'{final_stub}_appln_filing_date']]
    BatFC_Fam = BatFC_Fam.merge(dfcitations, on=f'{init_stub}_docdb_family_id', how='left')
    return BatFC_Fam


def add_techtype_ofthe_forward_or_backward_citations(BatFC_Fam, type_citation):
    init_stub, final_stub = set_stubs(type_citation)
    data = pd.read_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/Families_of_ipc_cpc_transpo_cpc_ipc.csv')
    BatFC_Fam = BatFC_Fam.merge(data, left_on=f'{final_stub}_docdb_family_id', right_on='docdb_family_id', how='left')
    BatFC_Fam = BatFC_Fam.drop(columns=['docdb_family_id'])
    return BatFC_Fam


def add_bvdids_and_NAICS_of_firms_citing_or_cited(BatFC_Fam, type_citation):
    init_stub, final_stub = set_stubs(type_citation)
    PatstatLinks = pd.read_csv(PATHS.privatedata / 'Patstat_orbis_correspondence/Orbis_PATSTAT_updatePEM_anonymized.csv', usecols=['bvdid', 'docdb_family_id']).drop_duplicates()
    PatstatLinks = PatstatLinks[PatstatLinks['docdb_family_id'].isin(BatFC_Fam[f'{final_stub}_docdb_family_id'])]
    BatFC_Fam = BatFC_Fam.merge(PatstatLinks.groupby('docdb_family_id').nunique(), left_on=f'{final_stub}_docdb_family_id', right_on='docdb_family_id', how='left')
    list_bvdids = PatstatLinks['bvdid'].drop_duplicates()
    list_bvdids.to_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/Citations_{type_citation}_bvdids_of_firms_{final_stub}_BatFC_families.csv', index=False)
    """
    MANUAL STEP HERE!!!
    Get naics code for these bvdids in orbis - via orbis platform.
    Start a search by bvdids and upload the csv saved above. 
    Then export search results from Orbis and save csv under PATHS.manualorbis
    --> The manually outputted files were called "NAICS_bvdids_of_families_{final_stub}_BatFCfam.xlsx"
    Orbis Search Results:
    For forward citations: 57703 BvD ID numbers have been found // 1379 BvD ID numbers could not be found
    For backward citations: 80216 BvD ID numbers have been found // 2392 BvD ID numbers could not be found
    """
    dfnaics = pd.read_excel(PATHS.manualorbis / f'NAICS_bvdids_of_families_{final_stub}_BatFCfam.xlsx', sheet_name='Results')
    dfnaics = dfnaics[['BvD ID number', 'NAICS 2017, core code (4 digits)', 'NAICS 2017, core code - description']]
    dfnaics = dfnaics[dfnaics['BvD ID number'].notnull()].drop_duplicates()
    dfnaics = dfnaics[dfnaics['NAICS 2017, core code (4 digits)'].notnull()].drop_duplicates()
    dfnaics['NAICScore3digit'] = dfnaics['NAICS 2017, core code (4 digits)'].astype(str).str[:3]
    PatstatLinks = PatstatLinks.merge(dfnaics, left_on='bvdid', right_on='BvD ID number', how='inner')
    dfFamNaics = PatstatLinks.groupby('docdb_family_id')['NAICScore3digit'].unique()
    dfFamNaics = dfFamNaics.apply(lambda x: ','.join(x))
    BatFC_Fam = BatFC_Fam.merge(dfFamNaics, left_on=f'{final_stub}_docdb_family_id', right_on='docdb_family_id', how='left')
    BatFC_Fam['NAICScore3digit'] = BatFC_Fam['NAICScore3digit'].fillna('')
    for naics in ['336', '335', '334', '333', '541']:
        BatFC_Fam[naics] = BatFC_Fam['NAICScore3digit'].apply(lambda x: True if (naics in x) else False)
    return BatFC_Fam


def add_dummies(BatFC_Fam, type_citation):
    init_stub, final_stub = set_stubs(type_citation)
    for k in ['OEMs', 'subsidiaries', 'suppliers', 'nace_motor', 'ipc_cpc_transpo']:
        data = pd.read_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/Families_of_{k}.csv', usecols=['docdb_family_id'])
        BatFC_Fam[k] = BatFC_Fam[f'{final_stub}_docdb_family_id'].isin(data['docdb_family_id'].tolist())
    return BatFC_Fam



def output_timeseries_for_type_citation(type_citation, subset='OEMs'):
    init_stub, final_stub = set_stubs(type_citation)
    dfcitations = pd.read_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/Citations_{type_citation}_of_AllBatFC_Families.csv')
    dfcitations['Year'] = dfcitations[f'{init_stub}_appln_filing_date'].apply(lambda x: int(x[:4]) if type(x) == str else x)
    mask_hascitations = dfcitations[f'{final_stub}_docdb_family_id'].notnull()
    maskyear = dfcitations['Year'].isin(range(1990, 2018))
    maskBatorFC = dfcitations[f'Sub-sector_{init_stub}'].str.contains('batteries')
    if subset == 'OEMs':
        data1 = pd.read_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/Families_of_OEMs.csv', usecols=['docdb_family_id'])
        data2 = pd.read_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/Families_of_subsidiaries.csv', usecols=['docdb_family_id'])
        data = pd.concat([data1, data2])
        maskOEMs = dfcitations['citing_docdb_family_id'].isin(data['docdb_family_id'])
    else:
        maskOEMs = dfcitations['citing_docdb_family_id'].notnull()
    # BATTERY
    df_cit_year_bat = construct_timeseries(dfcitations[maskBatorFC & mask_hascitations & maskyear & maskOEMs], type_citation)
    df_cit_year_bat.to_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/Citations_{type_citation}_Timeseries_battery{subset}.csv', index=True)
    # FUEL CELLS
    maskBatorFC = dfcitations[f'Sub-sector_{init_stub}'].str.contains('fuel cells')
    df_cit_year_bat = construct_timeseries(dfcitations[maskBatorFC & mask_hascitations & maskyear & maskOEMs], type_citation)
    df_cit_year_bat.to_csv(PATHS.dropbox / f'Data_outputted/C_PatentVariables/Citations_{type_citation}_Timeseries_fuelcell{subset}.csv', index=True)


def construct_timeseries(dfcit_tech, type_citation):
    """
    this function counts the number of backward citations of different types made in year t
    where year t is the application filing year of the citing family
    """
    init_stub, final_stub = set_stubs(type_citation)
    dfcit_tech['Year'] = dfcit_tech[f'{init_stub}_appln_filing_date'].str[:4].astype(int)
    df_cit_year = pd.DataFrame(dfcit_tech.groupby('Year')[f'{final_stub}_docdb_family_id'].count().rename('Total'))
    for type in ['batteries', 'electric vehicles', 'enabling technologies', 'energy storage', 'hybrid vehicles', 'smart grids', 'fuel cells', 'internal combustion engine', 'ice efficiency', 'car efficiency']:
        mask = dfcit_tech['Sub-sector'].str.contains(type) & dfcit_tech['Sub-sector'].notnull()
        typename = type.replace(' ', '')
        df_cit_year[typename] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
        df_cit_year[typename + 'pct'] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count() / df_cit_year['Total']
    df_cit_year[f'Total{init_stub}'] = dfcit_tech.groupby('Year')[f'{init_stub}_docdb_family_id'].nunique()
    mask = dfcit_tech['Sub-sector'].isnull()
    df_cit_year['NoCPCenergy'] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
    df_cit_year['NoCPCenergypct'] = df_cit_year['NoCPCenergy'] / df_cit_year['Total']
    # NAICS
    for naics in ['336', '335', '334', '333', '541']:
        mask = dfcit_tech['NAICScore3digit'].str.contains(naics) & dfcit_tech['NAICScore3digit'].notnull()
        df_cit_year[naics] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
        df_cit_year[naics + 'pct'] = df_cit_year[naics] / df_cit_year['Total']
    mask = dfcit_tech['NAICScore3digit'].isnull()
    df_cit_year['NoNaics'] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
    df_cit_year['NoNaicspct'] = df_cit_year['NoNaics'] / df_cit_year['Total']
    mask1 = dfcit_tech['NAICScore3digit'].str.contains('|'.join(['336', '335', '334', '333', '541'])) & dfcit_tech['NAICScore3digit'].notnull()
    mask2 = dfcit_tech['NAICScore3digit'].isnull()
    mask = mask1 | mask2
    df_cit_year['NotTopNaics'] = dfcit_tech[~mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
    df_cit_year['NotTopNaicspct'] = df_cit_year['NotTopNaics'] / df_cit_year['Total']
    mask = ~(dfcit_tech['NAICScore3digit'].str.contains('336') & dfcit_tech['NAICScore3digit'].notnull()) & dfcit_tech['NAICScore3digit'].notnull()
    df_cit_year['OtherThan336'] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
    df_cit_year['OtherThan336pct'] = df_cit_year['OtherThan336'] / df_cit_year['Total']

    mask = dfcit_tech['OEMs']
    df_cit_year['OEMs'] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
    df_cit_year['OEMspct'] = df_cit_year['OEMs'] / df_cit_year['Total']
    mask = dfcit_tech['OEMs'] | dfcit_tech['subsidiaries']
    df_cit_year['OEMsSubsi'] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
    df_cit_year['OEMsSubsipct'] = df_cit_year['OEMsSubsi'] / df_cit_year['Total']
    mask = dfcit_tech['suppliers']
    df_cit_year['suppliers'] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
    df_cit_year['supplierspct'] = df_cit_year['suppliers'] / df_cit_year['Total']
    mask = dfcit_tech['nace_motor']
    df_cit_year['nace_motor'] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
    df_cit_year['nace_motorpct'] = df_cit_year['nace_motor'] / df_cit_year['Total']
    mask = dfcit_tech['ipc_cpc_transpo']
    df_cit_year['ipc_cpc_transpo'] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
    df_cit_year['ipc_cpc_transpopct'] = df_cit_year['ipc_cpc_transpo'] / df_cit_year['Total']
    mask = dfcit_tech['OEMs'] & dfcit_tech['Sub-sector'].str.contains('Batteries') & dfcit_tech['Sub-sector'].notnull()
    df_cit_year['OEMs_Batteries'] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
    mask = dfcit_tech['OEMs'] & dfcit_tech['Sub-sector'].str.contains('Fuel Cells') & dfcit_tech['Sub-sector'].notnull()
    df_cit_year['OEMs_FC'] = dfcit_tech[mask].groupby('Year')[f'{final_stub}_docdb_family_id'].count()
    return df_cit_year

